Visión general del dataset. Vamos a cargar el dataset para analizar los diferentes valores de cada variable.
# Load the raw accident records (comma-separated, UTF-8 encoded).
# stringsAsFactors is set explicitly: the rest of the analysis expects the
# text columns to be factors, and R >= 4.0 no longer converts them by default.
df_raw <- read.csv("accidents_2017.csv", sep = ",", encoding = "UTF-8",
                   stringsAsFactors = TRUE)
# Number of rows and columns read
dim(df_raw)
## [1] 10339 15
str(df_raw)
## 'data.frame': 10339 obs. of 15 variables:
## $ Id : Factor w/ 10335 levels "2017S000001 ",..: 7990 6890 9761 5946 4210 7345 4079 10230 4747 3529 ...
## $ District.Name : Factor w/ 11 levels "Ciutat Vella",..: 11 11 11 11 8 8 8 8 8 8 ...
## $ Neighborhood.Name: Factor w/ 74 levels "Baró de Viver",..: 69 69 69 69 12 12 12 12 12 12 ...
## $ Street : Factor w/ 4253 levels " ORTIGOSA ",..: 2672 2680 2153 2677 2098 1959 3974 1938 1938 2357 ...
## $ Weekday : Factor w/ 7 levels "Friday","Monday",..: 1 1 1 1 5 7 3 6 2 7 ...
## $ Month : Factor w/ 12 levels "April","August",..: 11 12 3 6 9 12 9 3 7 9 ...
## $ Day : int 13 1 8 21 25 20 20 26 12 3 ...
## $ Hour : int 8 13 21 2 14 12 21 20 15 20 ...
## $ Part.of.the.day : Factor w/ 3 levels "Afternoon","Morning",..: 2 2 1 3 1 2 1 1 1 1 ...
## $ Mild.injuries : int 2 2 5 1 1 1 1 2 1 1 ...
## $ Serious.injuries : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Victims : int 2 2 5 1 1 1 1 2 1 1 ...
## $ Vehicles.involved: int 2 2 2 2 3 2 2 1 1 1 ...
## $ Longitude : num 2.13 2.12 2.17 2.12 2.19 ...
## $ Latitude : num 41.3 41.3 41.4 41.3 41.4 ...
Distrito
# District variable: list each distinct district name once
unique(df_raw[, "District.Name", drop = FALSE])
## District.Name
## 1 Unknown
## 5 Sant Martí
## 161 Ciutat Vella
## 242 Eixample
## 586 Sants-Montjuïc
## 758 Les Corts
## 817 Sarrià-Sant Gervasi
## 982 Gràcia
## 1053 Horta-Guinardó
## 1119 Nou Barris
## 1179 Sant Andreu
Barrio
unique(df_raw["Neighborhood.Name"])
## Neighborhood.Name
## 1 Unknown
## 5 el Camp de l'Arpa del Clot
## 36 el Clot
## 44 Sant Martí de Provençals
## 64 la Verneda i la Pau
## 65 el Besòs i el Maresme
## 79 Provençals del Poblenou
## 87 el Poblenou
## 111 Diagonal Mar i el Front Marítim del Poblenou
## 124 el Parc i la Llacuna del Poblenou
## 161 el Raval
## 169 la Vila Olímpica del Poblenou
## 181 el Barri Gòtic
## 198 Sant Pere, Santa Caterina i la Ribera
## 200 la Barceloneta
## 242 Sant Antoni
## 253 la Nova Esquerra de l'Eixample
## 321 la Sagrada Família
## 323 la Dreta de l'Eixample
## 390 l'Antiga Esquerra de l'Eixample
## 579 el Fort Pienc
## 586 el Poble-sec
## 650 la Marina del Prat Vermell
## 704 la Marina de Port
## 715 la Bordeta
## 718 Hostafrancs
## 730 la Font de la Guatlla
## 738 Sants
## 739 Sants - Badal
## 758 les Corts
## 775 la Maternitat i Sant Ramon
## 814 Pedralbes
## 817 Sant Gervasi - Galvany
## 871 Sarrià
## 921 Sant Gervasi - la Bonanova
## 922 les Tres Torres
## 964 el Putxet i el Farró
## 981 Vallvidrera, el Tibidabo i les Planes
## 982 Vallcarca i els Penitents
## 994 el Coll
## 1004 la Vila de Gràcia
## 1011 la Salut
## 1032 el Camp d'en Grassot i Gràcia Nova
## 1053 Sant Genís dels Agudells
## 1055 Montbau
## 1061 la Teixonera
## 1063 la Font d'en Fargues
## 1065 la Clota
## 1068 la Vall d'Hebron
## 1071 Horta
## 1088 Can Baró
## 1094 el Carmel
## 1101 el Baix Guinardó
## 1103 el Guinardó
## 1119 Torre Baró
## 1124 les Roquetes
## 1125 Canyelles
## 1126 Vallbona
## 1129 Ciutat Meridiana
## 1132 Verdun
## 1133 la Guineueta
## 1137 la Trinitat Nova
## 1147 la Prosperitat
## 1152 Vilapicina i la Torre Llobeta
## 1161 Porta
## 1172 el Turó de la Peira
## 1173 Can Peguera
## 1179 la Sagrera
## 1180 el Congrés i els Indians
## 1191 Sant Andreu
## 1199 Navas
## 1216 la Trinitat Vella
## 1220 Baró de Viver
## 1223 el Bon Pastor
Número de calles
length(unique(df_raw[["Street"]]))
## [1] 4253
Parte del día
unique(df_raw["Part.of.the.day"])
## Part.of.the.day
## 1 Morning
## 3 Afternoon
## 4 Night
Víctimas leves
unique(df_raw["Mild.injuries"])
## Mild.injuries
## 1 2
## 3 5
## 4 1
## 14 0
## 17 4
## 87 3
## 626 8
## 682 9
## 1087 6
## 1317 7
## 8346 10
summary(df_raw["Mild.injuries"])
## Mild.injuries
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 1.154
## 3rd Qu.: 1.000
## Max. :10.000
Víctimas graves
unique(df_raw["Serious.injuries"])
## Serious.injuries
## 1 0
## 14 1
## 865 2
## 9888 4
summary(df_raw["Serious.injuries"])
## Serious.injuries
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.02331
## 3rd Qu.:0.00000
## Max. :4.00000
Víctimas totales
unique(df_raw["Victims"])
## Victims
## 1 2
## 3 5
## 4 1
## 17 4
## 29 0
## 87 3
## 626 8
## 682 9
## 1087 6
## 1317 7
## 8346 10
summary(df_raw["Victims"])
## Victims
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 1.179
## 3rd Qu.: 1.000
## Max. :10.000
Vehículos involucrados
unique(df_raw["Vehicles.involved"])
## Vehicles.involved
## 1 2
## 5 3
## 8 1
## 28 4
## 54 5
## 178 6
## 865 13
## 874 8
## 963 9
## 1069 7
## 2568 10
## 2600 0
## 3722 11
## 6938 14
summary(df_raw["Vehicles.involved"])
## Vehicles.involved
## Min. : 0.000
## 1st Qu.: 2.000
## Median : 2.000
## Mean : 1.921
## 3rd Qu.: 2.000
## Max. :14.000
Procedemos a aplicar estas transformaciones especificadas en la memoria PDF:
# Build the processed dataset from a copy of the raw data.
df_proc <- df_raw

# Hour.Span: 3-hour bands derived from the integer Hour column.
# cut() replaces the per-band `df[rows, ]$col <- value` assignments, which
# stop with an error as soon as one band matches no rows.
# Bands are left-closed: [0,3), [3,6), ..., [21,24).
hour_labels <- c("00-03h", "03-06h", "06-09h", "09-12h",
                 "12-15h", "15-18h", "18-21h", "21-00h")
df_proc$Hour.Span <- cut(df_proc$Hour, breaks = seq(0, 24, by = 3),
                         labels = hour_labels, right = FALSE)

# Day.Span: day of the month grouped in bands of five days; the last band
# also absorbs day 31. Bands are right-closed: (0,5], (5,10], ..., (25,31].
day_labels <- c("1-5", "6-10", "11-15", "16-20", "21-25", "26-31")
df_proc$Day.Span <- cut(df_proc$Day, breaks = c(0, 5, 10, 15, 20, 25, 31),
                        labels = day_labels)

# Weekday as a factor ordered Monday..Sunday (instead of alphabetically).
workdays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
weekend_days <- c("Saturday", "Sunday")
df_proc$Weekday <- factor(as.character(df_proc$Weekday),
                          levels = c(workdays, weekend_days))

# Weekday.Weekend: two-level flag derived from Weekday. Rows whose weekday is
# in neither set stay NA rather than being silently assigned to a group.
day_type <- rep(NA_character_, nrow(df_proc))
day_type[df_proc$Weekday %in% workdays] <- "Weekday"
day_type[df_proc$Weekday %in% weekend_days] <- "Weekend"
df_proc$Weekday.Weekend <- factor(day_type, levels = c("Weekday", "Weekend"))

# Month as a factor in calendar order; month.name is base R's vector of
# English month names, January..December.
df_proc$Month <- factor(df_proc$Month, levels = month.name)

# Keep and reorder the columns of the processed dataset. Selecting by name
# also drops Street, Hour, Part.of.the.day and Day, which are not needed
# once the derived variables exist.
col_order <- c("Id", "District.Name", "Neighborhood.Name", "Longitude", "Latitude",
               "Month", "Day.Span", "Hour.Span", "Weekday", "Weekday.Weekend", "Mild.injuries",
               "Serious.injuries", "Victims", "Vehicles.involved")
df_proc <- df_proc[, col_order]
head(df_proc)
## Id District.Name Neighborhood.Name Longitude Latitude
## 1 2017S008429 Unknown Unknown 2.125624 41.34004
## 2 2017S007316 Unknown Unknown 2.120452 41.33943
## 3 2017S010210 Unknown Unknown 2.167356 41.36089
## 4 2017S006364 Unknown Unknown 2.124529 41.33767
## 5 2017S004615 Sant Martí el Camp de l'Arpa del Clot 2.185272 41.41636
## 6 2017S007775 Sant Martí el Camp de l'Arpa del Clot 2.183245 41.41634
## Month Day.Span Hour.Span Weekday Weekday.Weekend Mild.injuries
## 1 October 11-15 06-09h Friday Weekday 2
## 2 September 1-5 12-15h Friday Weekday 2
## 3 December 6-10 21-00h Friday Weekday 5
## 4 July 21-25 00-03h Friday Weekday 1
## 5 May 21-25 12-15h Thursday Weekday 1
## 6 September 16-20 12-15h Wednesday Weekday 1
## Serious.injuries Victims Vehicles.involved
## 1 0 2 2
## 2 0 2 2
## 3 0 5 2
## 4 0 1 2
## 5 0 1 3
## 6 0 1 2
Vamos a estudiar los atributos que podrían ser problemáticos, para detectar cuáles requieren tratamiento.
str(df_proc)
## 'data.frame': 10339 obs. of 14 variables:
## $ Id : Factor w/ 10335 levels "2017S000001 ",..: 7990 6890 9761 5946 4210 7345 4079 10230 4747 3529 ...
## $ District.Name : Factor w/ 11 levels "Ciutat Vella",..: 11 11 11 11 8 8 8 8 8 8 ...
## $ Neighborhood.Name: Factor w/ 74 levels "Baró de Viver",..: 69 69 69 69 12 12 12 12 12 12 ...
## $ Longitude : num 2.13 2.12 2.17 2.12 2.19 ...
## $ Latitude : num 41.3 41.3 41.4 41.3 41.4 ...
## $ Month : Factor w/ 12 levels "January","February",..: 10 9 12 7 5 9 5 12 6 5 ...
## $ Day.Span : Factor w/ 6 levels "1-5","6-10","11-15",..: 3 1 2 5 5 4 4 6 3 1 ...
## $ Hour.Span : Factor w/ 8 levels "00-03h","03-06h",..: 3 5 8 1 5 5 8 7 6 7 ...
## $ Weekday : Factor w/ 7 levels "Monday","Tuesday",..: 5 5 5 5 4 3 6 2 1 3 ...
## $ Weekday.Weekend : Factor w/ 2 levels "Weekday","Weekend": 1 1 1 1 1 1 2 1 1 1 ...
## $ Mild.injuries : int 2 2 5 1 1 1 1 2 1 1 ...
## $ Serious.injuries : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Victims : int 2 2 5 1 1 1 1 2 1 1 ...
## $ Vehicles.involved: int 2 2 2 2 3 2 2 1 1 1 ...
# Validamos que no existen valores nulos
colSums(is.na(df_proc))
## Id District.Name Neighborhood.Name Longitude
## 0 0 0 0
## Latitude Month Day.Span Hour.Span
## 0 0 0 0
## Weekday Weekday.Weekend Mild.injuries Serious.injuries
## 0 0 0 0
## Victims Vehicles.involved
## 0 0
# Revisamos variables categóricas
# 27 registros Unknown
summary(df_proc$District.Name)
## Ciutat Vella Eixample Gràcia Horta-Guinardó
## 600 3029 531 743
## Les Corts Nou Barris Sant Andreu Sant Martí
## 726 542 597 1336
## Sants-Montjuïc Sarrià-Sant Gervasi Unknown
## 1104 1104 27
# 27 registros Unknown
summary(df_proc$Neighborhood.Name)
## Baró de Viver
## 31
## Can Baró
## 15
## Can Peguera
## 8
## Canyelles
## 34
## Ciutat Meridiana
## 11
## Diagonal Mar i el Front Marítim del Poblenou
## 146
## el Baix Guinardó
## 111
## el Barri Gòtic
## 178
## el Besòs i el Maresme
## 76
## el Bon Pastor
## 129
## el Camp d'en Grassot i Gràcia Nova
## 122
## el Camp de l'Arpa del Clot
## 172
## el Carmel
## 92
## el Clot
## 151
## el Coll
## 23
## el Congrés i els Indians
## 38
## el Fort Pienc
## 277
## el Guinardó
## 113
## el Parc i la Llacuna del Poblenou
## 184
## el Poble-sec
## 234
## el Poblenou
## 148
## el Putxet i el Farró
## 134
## el Raval
## 150
## el Turó de la Peira
## 24
## Horta
## 148
## Hostafrancs
## 125
## l'Antiga Esquerra de l'Eixample
## 588
## la Barceloneta
## 138
## la Bordeta
## 97
## la Clota
## 24
## la Dreta de l'Eixample
## 1167
## la Font d'en Fargues
## 42
## la Font de la Guatlla
## 46
## la Guineueta
## 52
## la Marina de Port
## 105
## la Marina del Prat Vermell
## 249
## la Maternitat i Sant Ramon
## 203
## la Nova Esquerra de l'Eixample
## 387
## la Prosperitat
## 73
## la Sagrada Família
## 377
## la Sagrera
## 70
## la Salut
## 58
## la Teixonera
## 31
## la Trinitat Nova
## 75
## la Trinitat Vella
## 68
## la Vall d'Hebron
## 64
## la Verneda i la Pau
## 77
## la Vila de Gràcia
## 178
## la Vila Olímpica del Poblenou
## 121
## les Corts
## 319
## les Roquetes
## 40
## les Tres Torres
## 161
## Montbau
## 51
## Navas
## 64
## Pedralbes
## 204
## Porta
## 98
## Provençals del Poblenou
## 181
## Sant Andreu
## 197
## Sant Antoni
## 233
## Sant Genís dels Agudells
## 52
## Sant Gervasi - Galvany
## 385
## Sant Gervasi - la Bonanova
## 209
## Sant Martí de Provençals
## 80
## Sant Pere, Santa Caterina i la Ribera
## 134
## Sants
## 177
## Sants - Badal
## 71
## Sarrià
## 189
## Torre Baró
## 7
## Unknown
## 27
## Vallbona
## 8
## Vallcarca i els Penitents
## 150
## Vallvidrera, el Tibidabo i les Planes
## 26
## Verdun
## 24
## Vilapicina i la Torre Llobeta
## 88
# Sin valores Unknown
summary(df_proc$Month)
## January February March April May June July August
## 844 824 935 845 963 908 918 652
## September October November December
## 769 928 991 762
# Sin valores Unknown
summary(df_proc$Day.Span)
## 1-5 6-10 11-15 16-20 21-25 26-31
## 1720 1640 1662 1825 1642 1850
# Sin valores Unknown
summary(df_proc$Hour.Span)
## 00-03h 03-06h 06-09h 09-12h 12-15h 15-18h 18-21h 21-00h
## 400 230 1082 1642 2081 1949 1906 1049
# Sin valores Unknown
summary(df_proc$Weekday)
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 1510 1691 1650 1677 1761 1155 895
# Sin valores Unknown
summary(df_proc$Weekday.Weekend)
## Weekday Weekend
## 8289 2050
# Revisamos variables numéricas
# Sin valores Unknown
summary(df_proc$Longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.092 2.145 2.163 2.163 2.180 2.223
# Sin valores Unknown
summary(df_proc$Latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41.32 41.39 41.40 41.40 41.41 41.47
# Sin valores Unknown
summary(df_proc$Mild.injuries)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 1.154 1.000 10.000
# Sin valores Unknown
summary(df_proc$Serious.injuries)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02331 0.00000 4.00000
# Sin valores Unknown
summary(df_proc$Victims)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 1.179 1.000 10.000
# Sin valores Unknown
summary(df_proc$Vehicles.involved)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 2.000 1.921 2.000 14.000
Aplicamos imputación knn para completar los valores a Unknown en “District.Name” y “Neighborhood.Name”.
# Turn the "Unknown" placeholder into a real missing value so that it can be
# imputed. Only the two columns that carry the placeholder are touched, and
# the emptied "Unknown" level is dropped so it does not linger in the factors.
for (unknown_col in c("District.Name", "Neighborhood.Name")) {
  # %in% never yields NA, so the index stays valid if NAs already exist
  df_proc[[unknown_col]][df_proc[[unknown_col]] %in% "Unknown"] <- NA
  if (is.factor(df_proc[[unknown_col]])) {
    df_proc[[unknown_col]] <- droplevels(df_proc[[unknown_col]])
  }
}
# Missing values per column after the replacement
colSums(is.na(df_proc))
## Id District.Name Neighborhood.Name Longitude
## 0 27 27 0
## Latitude Month Day.Span Hour.Span
## 0 0 0 0
## Weekday Weekday.Weekend Mild.injuries Serious.injuries
## 0 0 0 0
## Victims Vehicles.involved
## 0 0
head(df_proc)
## Id District.Name Neighborhood.Name Longitude Latitude
## 1 2017S008429 <NA> <NA> 2.125624 41.34004
## 2 2017S007316 <NA> <NA> 2.120452 41.33943
## 3 2017S010210 <NA> <NA> 2.167356 41.36089
## 4 2017S006364 <NA> <NA> 2.124529 41.33767
## 5 2017S004615 Sant Martí el Camp de l'Arpa del Clot 2.185272 41.41636
## 6 2017S007775 Sant Martí el Camp de l'Arpa del Clot 2.183245 41.41634
## Month Day.Span Hour.Span Weekday Weekday.Weekend Mild.injuries
## 1 October 11-15 06-09h Friday Weekday 2
## 2 September 1-5 12-15h Friday Weekday 2
## 3 December 6-10 21-00h Friday Weekday 5
## 4 July 21-25 00-03h Friday Weekday 1
## 5 May 21-25 12-15h Thursday Weekday 1
## 6 September 16-20 12-15h Wednesday Weekday 1
## Serious.injuries Victims Vehicles.involved
## 1 0 2 2
## 2 0 2 2
## 3 0 5 2
## 4 0 1 2
## 5 0 1 3
## 6 0 1 2
Ahora lanzaremos el modelo para que nos rellene el distrito y el barrio usando longitud y latitud:
# Impute the missing District.Name / Neighborhood.Name values with kNN():
# k = 1 uses a single nearest neighbour and dist_var restricts the distance
# computation to Longitude and Latitude.
# The returned data frame also gains the logical columns District.Name_imp
# and Neighborhood.Name_imp (TRUE on imputed rows), as the head() output shows.
# NOTE(review): kNN is presumably VIM::kNN; the library() call is not visible
# in this section, so confirm the package.
df_proc <- kNN(df_proc,variable=c("District.Name","Neighborhood.Name"),k=1,dist_var=c("Longitude","Latitude"))
head(df_proc)
## Id District.Name Neighborhood.Name Longitude Latitude
## 1 2017S008429 Sants-Montjuïc la Marina del Prat Vermell 2.125624 41.34004
## 2 2017S007316 Sants-Montjuïc la Marina del Prat Vermell 2.120452 41.33943
## 3 2017S010210 Sants-Montjuïc la Marina del Prat Vermell 2.167356 41.36089
## 4 2017S006364 Sants-Montjuïc la Marina del Prat Vermell 2.124529 41.33767
## 5 2017S004615 Sant Martí el Camp de l'Arpa del Clot 2.185272 41.41636
## 6 2017S007775 Sant Martí el Camp de l'Arpa del Clot 2.183245 41.41634
## Month Day.Span Hour.Span Weekday Weekday.Weekend Mild.injuries
## 1 October 11-15 06-09h Friday Weekday 2
## 2 September 1-5 12-15h Friday Weekday 2
## 3 December 6-10 21-00h Friday Weekday 5
## 4 July 21-25 00-03h Friday Weekday 1
## 5 May 21-25 12-15h Thursday Weekday 1
## 6 September 16-20 12-15h Wednesday Weekday 1
## Serious.injuries Victims Vehicles.involved District.Name_imp
## 1 0 2 2 TRUE
## 2 0 2 2 TRUE
## 3 0 5 2 TRUE
## 4 0 1 2 TRUE
## 5 0 1 3 FALSE
## 6 0 1 2 FALSE
## Neighborhood.Name_imp
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 FALSE
## 6 FALSE
# Names of the numeric attributes
var_num <- c("Longitude", "Latitude", "Mild.injuries", "Serious.injuries", "Victims", "Vehicles.involved")
# Draw one horizontal, notched boxplot per numeric attribute
for (var in var_num) {
  boxplot(
    df_proc[[var]],
    main = paste("Boxplot Variable -", var),
    ylab = var,
    col = "orange",
    border = "brown",
    horizontal = TRUE,
    notch = TRUE
  )
}
Una vez realizados todos los tratamientos, vamos a exportar el dataset transformado.
# Save the processed dataset as the output CSV.
# NOTE(review): at this point df_proc still carries the District.Name_imp and
# Neighborhood.Name_imp indicator columns added by kNN(), so they are written
# to the file as well; confirm that this is intended.
write_csv(df_proc, 'accidents_2017_proc.csv')
# Working day vs. weekend
day_kind <- df_proc$Weekday.Weekend
accidentes.laborable <- df_proc[day_kind == "Weekday", ]
accidentes.festivo <- df_proc[day_kind == "Weekend", ]

# "Regular" working day vs. Friday
# (despite its name, accidentes.lun_vie holds Monday to Thursday only)
week_day <- df_proc$Weekday
is_mon_to_thu <- week_day %in% c("Monday", "Tuesday", "Wednesday", "Thursday")
accidentes.lun_vie <- df_proc[is_mon_to_thu, ]
accidentes.viernes <- df_proc[week_day == "Friday", ]

# Summer months (July, August) vs. rest of the year
`%notin%` <- Negate(`%in%`)
summer_months <- c("July", "August")
is_summer <- df_proc$Month %in% summer_months
accidentes.verano <- df_proc[is_summer, ]
accidentes.no_verano <- df_proc[!is_summer, ]

# First vs. last days of the month
day_band <- df_proc$Day.Span
accidentes.ppioMes <- df_proc[day_band == "1-5", ]
accidentes.finMes <- df_proc[day_band == "26-31", ]

# Specific hour bands
hour_band <- df_proc$Hour.Span
accidentes.madrugada <- df_proc[hour_band %in% c("00-03h", "03-06h"), ]
accidentes.primeraHora <- df_proc[hour_band == "06-09h", ]
accidentes.afterWork <- df_proc[hour_band == "18-21h", ]
accidentes.noche <- df_proc[hour_band == "21-00h", ]
accidentes.restoHoras <- df_proc[hour_band %in% c("09-12h", "12-15h", "15-18h"), ]
# En primer lugar, test Kolmogorov-Smirnov sobre variables numéricas
# Para Mild.injuries
ks.test(df_proc$Mild.injuries, pnorm, mean(df_proc$Mild.injuries), sd(df_proc$Mild.injuries))
## Warning in ks.test(df_proc$Mild.injuries, pnorm, mean(df_proc$Mild.injuries), :
## ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: df_proc$Mild.injuries
## D = 0.38844, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Para Serious.injuries
ks.test(df_proc$Serious.injuries, pnorm, mean(df_proc$Serious.injuries), sd(df_proc$Serious.injuries))
## Warning in ks.test(df_proc$Serious.injuries, pnorm,
## mean(df_proc$Serious.injuries), : ties should not be present for the Kolmogorov-
## Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: df_proc$Serious.injuries
## D = 0.53501, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Para Victims
ks.test(df_proc$Victims, pnorm, mean(df_proc$Victims), sd(df_proc$Victims))
## Warning in ks.test(df_proc$Victims, pnorm, mean(df_proc$Victims),
## sd(df_proc$Victims)): ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: df_proc$Victims
## D = 0.39753, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Para Vehicles.involved
ks.test(df_proc$Vehicles.involved, pnorm, mean(df_proc$Vehicles.involved), sd(df_proc$Vehicles.involved))
## Warning in ks.test(df_proc$Vehicles.involved, pnorm,
## mean(df_proc$Vehicles.involved), : ties should not be present for the
## Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: df_proc$Vehicles.involved
## D = 0.35506, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Additionally, run the Anderson-Darling test on every numeric column and
# list the ones for which normality is rejected at the alpha level.
# The names are collected first and printed in a single comma-separated line,
# so the list never ends with a dangling separator.
alpha <- 0.05
col.names <- colnames(df_proc)
# is.numeric() is TRUE for both integer and double columns
numeric_cols <- vapply(df_proc, is.numeric, logical(1))
ad_p_values <- vapply(df_proc[numeric_cols],
                      function(column) ad.test(column)$p.value,
                      numeric(1))
# which() discards NA p-values instead of propagating them into the list
non_normal <- names(ad_p_values)[which(ad_p_values < alpha)]
cat("Test de Anderson-Darling: lista de variables que no siguen una distribución normal:\n")
cat(paste(non_normal, collapse = ", "), "\n", sep = "")
## Test de Anderson-Darling: lista de variables que no siguen una distribución normal:
## Longitude, Latitude, Mild.injuries, Serious.injuries, Victims, Vehicles.involved
##Análisis preliminar de los datos mediante gráficas/histogramas con sus atributos
# Histogramas de variables numéricas
hist(df_proc$Mild.injuries, col="lightblue",xlab="Heridos leves",
ylab="Número",main="Distribución de heridos leves")
hist(df_proc$Serious.injuries, col="lightblue",xlab="Heridos graves",
ylab="Número",main="Distribución de heridos graves")
hist(df_proc$Vehicles.involved, breaks=c(0,1,2,3,4,5,6,7,8,20), col="lightblue",xlab="Vehículos involucrados",
ylab="Número",main="Distribución de vehículos involucrados")
hist(df_proc$Victims, breaks=c(0,1,2,3,4,5,6,7,8,20), col="lightblue",xlab="Víctimas totales",
ylab="Número",main="Distribución de víctimas")
# Victimas por weekday
accidents_weekday <- with(df_proc, table(Victims,Weekday))
barplot(accidents_weekday,beside = TRUE, legend = TRUE,
main="Víctimas de accidentes por día de la semana",
xlab="Día de la semana",ylab="Número de víctimas")
# Bar charts with the number of accidents per level of each categorical
# attribute. The same light-blue count layer is shared by all five plots.
count_bars <- geom_bar(fill = "lightblue", color = "lightblue", aes(y = (..count..)))

ggplot(data = df_proc, aes(x = Weekday)) +
  count_bars +
  ggtitle("Accidentes por día de la semana") +
  xlab("Día de la semana") +
  ylab("Número de accidentes")

ggplot(data = df_proc, aes(x = Month)) +
  count_bars +
  ggtitle("Accidentes por mes") +
  xlab("Mes") +
  ylab("Número de accidentes")

ggplot(data = df_proc, aes(x = Hour.Span)) +
  count_bars +
  ggtitle("Accidentes por franja horaria") +
  xlab("Franja horaria") +
  ylab("Número de accidentes")

ggplot(data = df_proc, aes(x = Day.Span)) +
  count_bars +
  ggtitle("Accidentes por momento del mes") +
  xlab("Momento del mes") +
  ylab("Número de accidentes")

ggplot(data = df_proc, aes(x = Weekday.Weekend)) +
  count_bars +
  ggtitle("Accidentes entre semana / fin de semana") +
  xlab("Entre semana / fin de semana") +
  ylab("Número de accidentes")
Identificar relación entre Meses/Días con mayor siniestralidad
# Keep only the columns of interest
df_sin <- df_proc[c("Month", "Day.Span", "Victims")]
# Drop the accidents without victims
df_sin <- subset(df_sin, Victims != 0)

# First look: share of accidents with victims, by month and day band.
# Day.Span is a band of days of the month, so the titles say "Franja de días"
# (they previously said "Franja horaria", which is the Hour.Span variable).
ggplot(df_sin,
       aes(x = Month,
           fill = Day.Span)) +
  geom_bar(position = "fill") +
  labs(y = "Percent",
       title = "Distribución de accidentes con víctimas por Mes/Franja de días") +
  theme(axis.text.x = element_text(angle = 30))
ggplot(df_sin,
       aes(x = Day.Span,
           fill = Month)) +
  geom_bar(position = "fill") +
  labs(y = "Percent",
       title = "Distribución de accidentes con víctimas por Franja de días/Mes") +
  theme(axis.text.x = element_text(angle = 30))

# One row per unique Month / Day.Span combination.
# NOTE(review): n() counts rows, i.e. accidents with at least one victim, not
# the sum of the Victims column; confirm which one N_victims should hold.
# ungroup() removes the grouping that summarize() leaves on the result, so
# later steps work on a plain (ungrouped) table.
df_sin <-
  df_sin %>%
  group_by(Month, Day.Span) %>%
  dplyr::summarize(N_victims = n()) %>%
  dplyr::ungroup()

# Finally, plot the aggregated values, coloured by day band
Dias <- factor(df_sin$Day.Span)
qplot(Month, N_victims, data = df_sin, colour = Dias, size = I(5)) +
  labs(y = "Victims",
       title = "Total víctimas por Mes/Franja días") +
  theme(axis.text.x = element_text(angle = 30))
Vamos a revisar con el test Chi-Square:
# Revisamos nuestro dataframe de siniestralidad
summary(df_sin)
## Month Day.Span N_victims
## January : 6 1-5 :12 Min. : 57.0
## February: 6 6-10 :12 1st Qu.:112.5
## March : 6 11-15:12 Median :133.0
## April : 6 16-20:12 Mean :131.1
## May : 6 21-25:12 3rd Qu.:147.0
## June : 6 26-31:12 Max. :196.0
## (Other) :36
# Build a categorical variable from N_victims so that the chi-square test can
# be applied between categorical variables.
# A boxplot and a density histogram are drawn first to decide how to split it.
boxplot(df_sin$N_victims,
        main = "Boxplot de accidentes con siniestralidad",
        xlab = "Victims",
        col = "orange",
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)
ggplot(df_sin, aes(x = N_victims)) +
  geom_histogram(aes(y = ..density..),
                 binwidth = 1, colour = "black", fill = "white") +
  geom_density(alpha = .2, fill = "#FF6666") +
  geom_vline(aes(xintercept = mean(N_victims, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  labs(title = "Histograma de densidad de víctimas de accidentes")

# Split the 50-209 range into four bins of 40: 50-89, 90-129, 130-169, 170-209.
# cut() replaces the per-bin `df[rows, ]$col <- value` assignments, which stop
# with an error when a bin matches no rows. Counts outside 50-209 become NA
# instead of turning into one-off levels, and bins that end up empty are
# dropped so they do not add all-zero columns to the contingency table.
# Levels are ordered from lowest to highest accident rate.
df_sin$Victim.Span <- droplevels(cut(
  df_sin$N_victims,
  breaks = c(49, 89, 129, 169, 209),
  labels = c("Baja Siniestralidad", "Media Siniestralidad",
             "Alta Siniestralidad", "Muy Alta Siniestralidad")
))
df_sin_chisq <- subset(df_sin, select = -c(N_victims))

# Chi-square test of independence between Month and the binned variable
chisq.test(df_sin_chisq$Month, df_sin_chisq$Victim.Span)
## Warning in chisq.test(df_sin_chisq$Month, df_sin_chisq$Victim.Span): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: df_sin_chisq$Month and df_sin_chisq$Victim.Span
## X-squared = 44.488, df = 33, p-value = 0.08739
chisq.test(df_sin_chisq$Day.Span,df_sin_chisq$Victim.Span)
## Warning in chisq.test(df_sin_chisq$Day.Span, df_sin_chisq$Victim.Span): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: df_sin_chisq$Day.Span and df_sin_chisq$Victim.Span
## X-squared = 9.4021, df = 15, p-value = 0.8556
Identificar si existe mayor siniestralidad a principios o finales de mes
# Aprovechamos lo definido anteriormente:
accidentes.ppioMes <- df_proc[df_proc$Day.Span == "1-5",]
accidentes.finMes <- df_proc[df_proc$Day.Span == "26-31",]
# Vamos a realizar el contraste de hipotesis para cada variable numérica que disponemos:
# Mild.injuries, Serious.injuries, Victims, Vehicles.involved
t.test(accidentes.ppioMes$Mild.injuries, accidentes.finMes$Mild.injuries)
##
## Welch Two Sample t-test
##
## data: accidentes.ppioMes$Mild.injuries and accidentes.finMes$Mild.injuries
## t = -0.83572, df = 3524.5, p-value = 0.4034
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.06943437 0.02793217
## sample estimates:
## mean of x mean of y
## 1.135465 1.156216
t.test(accidentes.ppioMes$Serious.injuries, accidentes.finMes$Serious.injuries)
##
## Welch Two Sample t-test
##
## data: accidentes.ppioMes$Serious.injuries and accidentes.finMes$Serious.injuries
## t = -0.061248, df = 3432.2, p-value = 0.9512
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.0120344 0.0113053
## sample estimates:
## mean of x mean of y
## 0.02558140 0.02594595
t.test(accidentes.ppioMes$Victims, accidentes.finMes$Victims)
##
## Welch Two Sample t-test
##
## data: accidentes.ppioMes$Victims and accidentes.finMes$Victims
## t = -0.82604, df = 3507.3, p-value = 0.4088
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.06899774 0.02809264
## sample estimates:
## mean of x mean of y
## 1.162791 1.183243
t.test(accidentes.ppioMes$Vehicles.involved, accidentes.finMes$Vehicles.involved)
##
## Welch Two Sample t-test
##
## data: accidentes.ppioMes$Vehicles.involved and accidentes.finMes$Vehicles.involved
## t = 0.2687, df = 3501.4, p-value = 0.7882
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03989470 0.05256597
## sample estimates:
## mean of x mean of y
## 1.920930 1.914595
Identificar si existe correlación entre heridos graves en función de horario nocturno o diurno
# Split the accidents into daytime (06h-21h) and night-time (21h-06h) bands.
# The two names were previously swapped: the 06-21h rows were stored as
# accidentes.noche and the 21-06h rows as accidentes.dia.
accidentes.dia <- df_proc[df_proc$Hour.Span %in% c("06-09h", "09-12h", "12-15h", "15-18h", "18-21h"),]
accidentes.noche <- df_proc[df_proc$Hour.Span %in% c("21-00h", "00-03h", "03-06h"),]
# Welch t-tests, daytime (x) against night-time (y)
t.test(accidentes.dia$Serious.injuries, accidentes.noche$Serious.injuries)
##
## Welch Two Sample t-test
##
## data: accidentes.dia$Serious.injuries and accidentes.noche$Serious.injuries
## t = -2.3696, df = 2116.6, p-value = 0.01789
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.021913837 -0.002067362
## sample estimates:
## mean of x mean of y
## 0.02136259 0.03335319
t.test(accidentes.dia$Victims, accidentes.noche$Victims)
##
## Welch Two Sample t-test
##
## data: accidentes.dia$Victims and accidentes.noche$Victims
## t = -0.21828, df = 2100.5, p-value = 0.8272
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05009977 0.04006411
## sample estimates:
## mean of x mean of y
## 1.177829 1.182847
Identificar si existe la misma frecuencia de accidentes múltiples entre semana o fin de semana
# Accidents with more than one vehicle involved, keeping only the columns
# needed to identify the calendar day
df_multi_acc <- df_raw[df_raw$Vehicles.involved > 1, c("Id", "Weekday", "Month", "Day")]
# Number of such accidents per Weekday / Month / Day combination.
# The call is qualified as plyr::count because the `vars =` argument and the
# resulting `freq` column are plyr's; an unqualified count() resolves to
# dplyr::count whenever dplyr is attached after plyr.
df_multi_acc_freq <- plyr::count(df_multi_acc, vars = c("Weekday", "Month", "Day"))
# Split the daily frequencies into working days and weekend days
acc.multi.semana <- df_multi_acc_freq[df_multi_acc_freq$Weekday %in% c("Monday","Tuesday","Wednesday","Thursday","Friday"),]
acc.multi.finde <- df_multi_acc_freq[df_multi_acc_freq$Weekday %in% c("Saturday","Sunday"),]
# Muestro la dimensión de los datasets para saber si puedo aplicar el Teorema Central del Límite
dim(acc.multi.finde)
## [1] 105 4
dim(acc.multi.semana)
## [1] 260 4
# Overlaid histograms of both samples to compare their distributions.
# The histograms are computed without plotting and then drawn on one canvas.
hA <- hist(acc.multi.finde$freq, plot = FALSE)
hB <- hist(acc.multi.semana$freq, plot = FALSE)
# Semi-transparent fill colours; maxColorValue is spelled out instead of
# relying on partial matching of `max =`
c1 <- rgb(255, 128, 0, maxColorValue = 255, alpha = 50, names = "orange")
c2 <- rgb(0, 25, 255, maxColorValue = 255, alpha = 50, names = "blue")
plot(hA, col = c1, xlim = c(0, 50), ylim = c(0, 100), xlab = "Número de accidentes múltiples",
     ylab = "Frecuencia", main = "Distribución de accidentes múltiples entre semana/fin de semana")
plot(hB, col = c2, add = TRUE)
legend(1, 95, legend = c("Fin de semana", "Entre semana"),
       fill = c(c1, c2))
# Lanzo un var-test para ver si hay o no igualdad de varianza
var.test( acc.multi.finde$freq, acc.multi.semana$freq)
##
## F test to compare two variances
##
## data: acc.multi.finde$freq and acc.multi.semana$freq
## F = 0.5232, num df = 104, denom df = 259, p-value = 0.0001994
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3830094 0.7309997
## sample estimates:
## ratio of variances
## 0.5231969
# Como no hay igualdad de varianza, lanzo un test t para dos muestras con varianzas diferentes
t.test(acc.multi.finde$freq, acc.multi.semana$freq, var.equal=FALSE)
##
## Welch Two Sample t-test
##
## data: acc.multi.finde$freq and acc.multi.semana$freq
## t = -16.025, df = 263.48, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -12.17019 -9.50673
## sample estimates:
## mean of x mean of y
## 14.40000 25.23846
Identificar si existe la misma frecuencia de accidentes dentro o fuera de periodo vacacional
# Number of accidents per Month / Day combination.
# The call is qualified as plyr::count because the `vars =` argument and the
# resulting `freq` column are plyr's; an unqualified count() resolves to
# dplyr::count whenever dplyr is attached after plyr.
accidentes.freq <- plyr::count(df_raw, vars = c("Month", "Day"))
# Split the daily frequencies into summer (July, August) and rest of the year
en_verano <- accidentes.freq$Month %in% c("July", "August")
accidentes.verano.freq <- accidentes.freq[en_verano, ]
accidentes.no_verano.freq <- accidentes.freq[!en_verano, ]
# Muestro la dimensión de los datasets para saber si puedo aplicar el Teorema Central del Límite
dim(accidentes.verano.freq)
## [1] 62 3
dim(accidentes.no_verano.freq)
## [1] 303 3
# Hago un hitograma superpuesto de ambos datasets para ver las distribuciones
hA <- hist(accidentes.verano.freq$freq, plot=FALSE)
hB <- hist(accidentes.no_verano.freq$freq, plot=FALSE)
c1 <- rgb(255,128,0,max = 255, alpha = 50, names = "orange")
c2 <- rgb(0,25,255, max = 255, alpha = 50, names = "blue")
plot(hA, col = c1, xlim = c(0,60),ylim = c(0,80), xlab= "Número de accidentes",
ylab="Frecuencia", main="Distribución de accidentes múltiples en vacaciones/no vacaciones")
plot(hB, col = c2, add=TRUE)
legend(1, 80, legend=c("Vacaciones", "Fuera de vacaciones"),
fill=c(c1, c2))
# Lanzo un var-test para ver si hay o no igualdad de varianza
var.test( accidentes.verano.freq$freq, accidentes.no_verano.freq$freq)
##
## F test to compare two variances
##
## data: accidentes.verano.freq$freq and accidentes.no_verano.freq$freq
## F = 1.016, num df = 61, denom df = 302, p-value = 0.9014
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7044089 1.5427657
## sample estimates:
## ratio of variances
## 1.016001
# Como hay igualdad de varianza, lanzo un test t para dos muestras con varianzas iguales
t.test(accidentes.verano.freq$freq, accidentes.no_verano.freq$freq,var.equal = TRUE)
##
## Two Sample t-test
##
## data: accidentes.verano.freq$freq and accidentes.no_verano.freq$freq
## t = -2.6917, df = 363, p-value = 0.007438
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.2612752 -0.9747516
## sample estimates:
## mean of x mean of y
## 25.32258 28.94059
Identificar si existe correlación entre la hora y el número de accidentes
# Bar charts of the number of accidents per hour of the day.
# geom_bar() counts the rows for each x value by default, so no explicit y
# mapping is needed; this avoids the `..count..` notation, which is deprecated
# in recent ggplot2 releases.

# All days
ggplot(data = df_raw, aes(x = Hour)) +
  geom_bar(fill = "lightblue", color = "lightblue") +
  ggtitle("Accidentes por hora del día") +
  xlab("Hora") +
  ylab("Número de accidentes")
# Working days only (Monday-Friday)
ggplot(
  data = df_raw[df_raw$Weekday %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ],
  aes(x = Hour)
) +
  geom_bar(fill = "lightblue", color = "lightblue") +
  ggtitle("Accidentes por hora del día entre semana") +
  xlab("Hora") +
  ylab("Número de accidentes")
# Weekend only (Saturday-Sunday)
ggplot(
  data = df_raw[df_raw$Weekday %in% c("Saturday", "Sunday"), ],
  aes(x = Hour)
) +
  geom_bar(fill = "lightblue", color = "lightblue") +
  ggtitle("Accidentes por hora del día en fin de semana") +
  xlab("Hora") +
  ylab("Número de accidentes")
# Overlay of the hourly accident counts: working days (blue) with the weekend
# (grey) drawn on top. Fill and outline colour are mapped to a label in aes()
# and resolved through manual scales so that a legend is actually produced;
# with constant colours ggplot2 draws no legend and the two series cannot be
# told apart. Both scales share the same (empty) title and labels so that they
# are shown as one legend.
colores_periodo <- c("Entre semana" = "lightblue", "Fin de semana" = "lightgrey")
ggplot(
  data = df_raw[df_raw$Weekday %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ],
  aes(x = Hour)
) +
  geom_bar(aes(fill = "Entre semana", color = "Entre semana")) +
  geom_bar(
    data = df_raw[df_raw$Weekday %in% c("Saturday", "Sunday"), ],
    aes(fill = "Fin de semana", color = "Fin de semana")
  ) +
  scale_fill_manual(name = NULL, values = colores_periodo) +
  scale_color_manual(name = NULL, values = colores_periodo) +
  theme(legend.position = "right") +
  ggtitle("Accidentes por hora del día") +
  xlab("Hora") +
  ylab("Número de accidentes")
En primer lugar visualizamos los puntos negros por volumen de accidentes.
# Aggregate the accidents in df_proc on a grid of 15 longitude intervals by
# 15 latitude intervals. Each occupied cell keeps the mean coordinates of its
# accidents and the number of accidents `n`.
geo_accidents <- df_proc %>%
  dplyr::mutate(
    bin_lon = cut(Longitude, 15),
    bin_lat = cut(Latitude, 15)
  ) %>%
  group_by(bin_lon, bin_lat) %>%
  dplyr::summarise(
    mean_lon = mean(Longitude),
    mean_lat = mean(Latitude),
    n = n()
  )
# Aggregated map: one point per grid cell, point size driven by `n`
m <- mapview(
  geo_accidents,
  xcol = "mean_lon", ycol = "mean_lat",
  cex = "n", crs = 4326, grid = FALSE, legend = FALSE
)
# Full map: one point per row of df_proc
m_full <- mapview(
  df_proc,
  xcol = "Longitude", ycol = "Latitude",
  crs = 4326, grid = FALSE,
  layer.name = 'Datos Accidentes BCN 2017'
)
# Export to the working directory: a PNG snapshot of the aggregated map and
# HTML versions of both maps (written with selfcontained = FALSE)
mapshot(m, file = paste0(getwd(), "/map.png"))
mapshot(m, url = paste0(getwd(), "/map.html"), selfcontained = FALSE)
mapshot(m_full, url = paste0(getwd(), "/map_raw.html"), selfcontained = FALSE)
# Embed the PNG snapshot in the report
knitr::include_graphics("./map.png")
A continuación, vamos a visualizar un mapa únicamente con los accidentes con víctimas.
# Keep only the accidents with at least one victim, then aggregate them on a
# grid of 30 longitude intervals by 30 latitude intervals. Each occupied cell
# keeps the mean coordinates of its accidents and the total number of
# victims `n`.
geo_siniestralidad <- df_proc[df_proc$Victims > 0, ] %>%
  dplyr::mutate(
    bin_lon = cut(Longitude, 30),
    bin_lat = cut(Latitude, 30)
  ) %>%
  group_by(bin_lon, bin_lat) %>%
  dplyr::summarise(
    mean_lon = mean(Longitude),
    mean_lat = mean(Latitude),
    n = sum(Victims)
  )
# One point per grid cell, point size driven by the number of victims
m_victims <- mapview(
  geo_siniestralidad,
  xcol = "mean_lon", ycol = "mean_lat",
  cex = "n", crs = 4326, grid = FALSE,
  layer.name = 'Accidentes con víctimas BCN 2017'
)
# Export a PNG snapshot and an HTML version (selfcontained = FALSE) to the
# working directory
mapshot(m_victims, file = paste0(getwd(), "/map_victims.png"))
mapshot(m_victims, url = paste0(getwd(), "/map_victims.html"), selfcontained = FALSE)
# Embed the PNG snapshot in the report
knitr::include_graphics("./map_victims.png")
A continuación, vamos a visualizar un mapa únicamente con los accidentes con múltiples vehículos implicados.
# Keep only the accidents with more than one vehicle involved, then aggregate
# them on a grid of 30 longitude intervals by 30 latitude intervals. Each
# occupied cell keeps the mean coordinates of its accidents and the number of
# accidents `n`.
geo_multipleAcc <- df_proc[df_proc$Vehicles.involved > 1, ] %>%
  dplyr::mutate(
    bin_lon = cut(Longitude, 30),
    bin_lat = cut(Latitude, 30)
  ) %>%
  group_by(bin_lon, bin_lat) %>%
  dplyr::summarise(
    mean_lon = mean(Longitude),
    mean_lat = mean(Latitude),
    n = n()
  )
# One point per grid cell, point size driven by the number of accidents
m_multipleAcc <- mapview(
  geo_multipleAcc,
  xcol = "mean_lon", ycol = "mean_lat",
  cex = "n", crs = 4326, grid = FALSE,
  layer.name = 'Accidentes con múltiples vehiculos implicados BCN 2017'
)
# Export a PNG snapshot and an HTML version (selfcontained = FALSE) to the
# working directory
mapshot(m_multipleAcc, file = paste0(getwd(), "/map_multipleAcc.png"))
mapshot(m_multipleAcc, url = paste0(getwd(), "/map_multipleAcc.html"), selfcontained = FALSE)
# Embed the PNG snapshot in the report
knitr::include_graphics("./map_multipleAcc.png")
Finalmente, vamos a visualizar un mapa para discernir accidentes diurnos y nocturnos.
# Hour spans treated as daytime (06h-21h) and as night (21h-06h)
horas_dia <- c("06-09h", "09-12h", "12-15h", "15-18h", "18-21h")
horas_noche <- c("21-00h", "00-03h", "03-06h")
es_dia <- df_proc$Hour.Span %in% horas_dia
es_noche <- df_proc$Hour.Span %in% horas_noche
# Copy of all accidents with a day/night flag DN:
# 0 = span not listed above, 1 = daytime, 2 = night.
# The flag is assigned by column name rather than by position so that it
# always lands in DN, however many columns df_proc has.
# NOTE(review): geo_diaNoche is not used by the maps below - confirm whether
# later code still needs it.
geo_diaNoche <- df_proc
geo_diaNoche$DN <- 0
geo_diaNoche$DN[es_dia] <- 1
geo_diaNoche$DN[es_noche] <- 2
# Aggregate each subset on its own grid of 30 longitude intervals by
# 30 latitude intervals. Each occupied cell keeps the mean coordinates of its
# accidents and the number of accidents `n`.
geo_dia <- df_proc[es_dia, ] %>%
  dplyr::mutate(
    bin_lon = cut(Longitude, 30),
    bin_lat = cut(Latitude, 30)
  ) %>%
  group_by(bin_lon, bin_lat) %>%
  dplyr::summarise(mean_lon = mean(Longitude),
                   mean_lat = mean(Latitude), n = n())
geo_noche <- df_proc[es_noche, ] %>%
  dplyr::mutate(
    bin_lon = cut(Longitude, 30),
    bin_lat = cut(Latitude, 30)
  ) %>%
  group_by(bin_lon, bin_lat) %>%
  dplyr::summarise(mean_lon = mean(Longitude),
                   mean_lat = mean(Latitude), n = n())
# Two-layer map: daytime cells in blue, night cells in grey, point size
# driven by the number of accidents
m_diaNoche <- mapview(geo_dia, xcol = "mean_lon", ycol = "mean_lat", cex = "n",
                      col.regions = 'blue', alpha = 0.5,
                      crs = 4326, grid = FALSE,
                      layer.name = 'Accidentes horario diurno BCN 2017') +
  mapview(geo_noche, xcol = "mean_lon", ycol = "mean_lat", cex = "n",
          col.regions = 'grey', alpha = 0.5,
          crs = 4326, grid = FALSE,
          layer.name = 'Accidentes horario nocturno BCN 2017')
# Export a PNG snapshot and an HTML version (selfcontained = FALSE) to the
# working directory
mapshot(m_diaNoche, file = paste0(getwd(), "/map_diaNoche.png"))
mapshot(m_diaNoche, url = paste0(getwd(), "/map_diaNoche.html"), selfcontained = FALSE)
# Embed the PNG snapshot in the report
knitr::include_graphics("./map_diaNoche.png")